DATA PREPROCESSING¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In [2]:
df=pd.read_csv('train.csv')
df
Out[2]:
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores ... px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
0 842 0 2.2 0 1 0 7 0.6 188 2 ... 20 756 2549 9 7 19 0 0 1 1
1 1021 1 0.5 1 0 1 53 0.7 136 3 ... 905 1988 2631 17 3 7 1 1 0 2
2 563 1 0.5 1 2 1 41 0.9 145 5 ... 1263 1716 2603 11 2 9 1 1 0 2
3 615 1 2.5 0 0 0 10 0.8 131 6 ... 1216 1786 2769 16 8 11 1 0 0 2
4 1821 1 1.2 0 13 1 44 0.6 141 2 ... 1208 1212 1411 8 2 15 1 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1995 794 1 0.5 1 0 1 2 0.8 106 6 ... 1222 1890 668 13 4 19 1 1 0 0
1996 1965 1 2.6 1 0 0 39 0.2 187 4 ... 915 1965 2032 11 10 16 1 1 1 2
1997 1911 0 0.9 1 1 1 36 0.7 108 8 ... 868 1632 3057 9 1 5 1 1 0 3
1998 1512 0 0.9 0 4 1 46 0.1 145 5 ... 336 670 869 18 10 19 1 1 1 0
1999 510 1 2.0 1 5 1 45 0.9 168 6 ... 483 754 3919 19 4 2 1 1 1 3

2000 rows × 21 columns

In [3]:
df.shape
Out[3]:
(2000, 21)
In [4]:
df.head()
Out[4]:
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores ... px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
0 842 0 2.2 0 1 0 7 0.6 188 2 ... 20 756 2549 9 7 19 0 0 1 1
1 1021 1 0.5 1 0 1 53 0.7 136 3 ... 905 1988 2631 17 3 7 1 1 0 2
2 563 1 0.5 1 2 1 41 0.9 145 5 ... 1263 1716 2603 11 2 9 1 1 0 2
3 615 1 2.5 0 0 0 10 0.8 131 6 ... 1216 1786 2769 16 8 11 1 0 0 2
4 1821 1 1.2 0 13 1 44 0.6 141 2 ... 1208 1212 1411 8 2 15 1 1 0 1

5 rows × 21 columns

In [5]:
df.tail()
Out[5]:
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores ... px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
1995 794 1 0.5 1 0 1 2 0.8 106 6 ... 1222 1890 668 13 4 19 1 1 0 0
1996 1965 1 2.6 1 0 0 39 0.2 187 4 ... 915 1965 2032 11 10 16 1 1 1 2
1997 1911 0 0.9 1 1 1 36 0.7 108 8 ... 868 1632 3057 9 1 5 1 1 0 3
1998 1512 0 0.9 0 4 1 46 0.1 145 5 ... 336 670 869 18 10 19 1 1 1 0
1999 510 1 2.0 1 5 1 45 0.9 168 6 ... 483 754 3919 19 4 2 1 1 1 3

5 rows × 21 columns

In [6]:
df.info
Out[6]:
<bound method DataFrame.info of       battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  \
0               842     0          2.2         0   1       0           7   
1              1021     1          0.5         1   0       1          53   
2               563     1          0.5         1   2       1          41   
3               615     1          2.5         0   0       0          10   
4              1821     1          1.2         0  13       1          44   
...             ...   ...          ...       ...  ..     ...         ...   
1995            794     1          0.5         1   0       1           2   
1996           1965     1          2.6         1   0       0          39   
1997           1911     0          0.9         1   1       1          36   
1998           1512     0          0.9         0   4       1          46   
1999            510     1          2.0         1   5       1          45   

      m_dep  mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  \
0       0.6        188        2  ...         20       756  2549     9     7   
1       0.7        136        3  ...        905      1988  2631    17     3   
2       0.9        145        5  ...       1263      1716  2603    11     2   
3       0.8        131        6  ...       1216      1786  2769    16     8   
4       0.6        141        2  ...       1208      1212  1411     8     2   
...     ...        ...      ...  ...        ...       ...   ...   ...   ...   
1995    0.8        106        6  ...       1222      1890   668    13     4   
1996    0.2        187        4  ...        915      1965  2032    11    10   
1997    0.7        108        8  ...        868      1632  3057     9     1   
1998    0.1        145        5  ...        336       670   869    18    10   
1999    0.9        168        6  ...        483       754  3919    19     4   

      talk_time  three_g  touch_screen  wifi  price_range  
0            19        0             0     1            1  
1             7        1             1     0            2  
2             9        1             1     0            2  
3            11        1             0     0            2  
4            15        1             1     0            1  
...         ...      ...           ...   ...          ...  
1995         19        1             1     0            0  
1996         16        1             1     1            2  
1997          5        1             1     0            3  
1998         19        1             1     1            0  
1999          2        1             1     1            3  

[2000 rows x 21 columns]>
In [7]:
df.describe()
Out[7]:
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores ... px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
count 2000.000000 2000.0000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 ... 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000
mean 1238.518500 0.4950 1.522250 0.509500 4.309500 0.521500 32.046500 0.501750 140.249000 4.520500 ... 645.108000 1251.515500 2124.213000 12.306500 5.767000 11.011000 0.761500 0.503000 0.507000 1.500000
std 439.418206 0.5001 0.816004 0.500035 4.341444 0.499662 18.145715 0.288416 35.399655 2.287837 ... 443.780811 432.199447 1084.732044 4.213245 4.356398 5.463955 0.426273 0.500116 0.500076 1.118314
min 501.000000 0.0000 0.500000 0.000000 0.000000 0.000000 2.000000 0.100000 80.000000 1.000000 ... 0.000000 500.000000 256.000000 5.000000 0.000000 2.000000 0.000000 0.000000 0.000000 0.000000
25% 851.750000 0.0000 0.700000 0.000000 1.000000 0.000000 16.000000 0.200000 109.000000 3.000000 ... 282.750000 874.750000 1207.500000 9.000000 2.000000 6.000000 1.000000 0.000000 0.000000 0.750000
50% 1226.000000 0.0000 1.500000 1.000000 3.000000 1.000000 32.000000 0.500000 141.000000 4.000000 ... 564.000000 1247.000000 2146.500000 12.000000 5.000000 11.000000 1.000000 1.000000 1.000000 1.500000
75% 1615.250000 1.0000 2.200000 1.000000 7.000000 1.000000 48.000000 0.800000 170.000000 7.000000 ... 947.250000 1633.000000 3064.500000 16.000000 9.000000 16.000000 1.000000 1.000000 1.000000 2.250000
max 1998.000000 1.0000 3.000000 1.000000 19.000000 1.000000 64.000000 1.000000 200.000000 8.000000 ... 1960.000000 1998.000000 3998.000000 19.000000 18.000000 20.000000 1.000000 1.000000 1.000000 3.000000

8 rows × 21 columns

In [8]:
df.isnull().sum().sum()
Out[8]:
0

DATA VISUALIZATION¶

PAIR PLOT¶

In [9]:
sns.pairplot(df,hue='price_range')
Out[9]:
<seaborn.axisgrid.PairGrid at 0x1df7c9805d0>
No description has been provided for this image

HISTOGRAM PLOT¶

In [10]:
plt.figure(figsize=(10,6))
df['fc'].hist(alpha=0.5,color='blue',label='Front camera')
df['pc'].hist(alpha=0.6,color='red',label='Primary camera')
plt.legend()
plt.xlabel('MegaPixels')
Out[10]:
Text(0.5, 0, 'MegaPixels')
No description has been provided for this image

BOX PLOT¶

In [11]:
sns.boxplot(x="price_range", y="battery_power", data=df)
Out[11]:
<Axes: xlabel='price_range', ylabel='battery_power'>
No description has been provided for this image

BAR PLOT¶

In [12]:
sns.barplot(x = 'price_range', y = 'ram', data =df)
Out[12]:
<Axes: xlabel='price_range', ylabel='ram'>
No description has been provided for this image

DATA SPLITTING¶

In [13]:
X=df.drop('price_range',axis=1)
y=df['price_range']

print('Shape of X = ', X.shape)
print('Shape of y = ', y.shape)
Shape of X =  (2000, 20)
Shape of y =  (2000,)
In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)
Shape of X_train =  (1600, 20)
Shape of y_train =  (1600,)
Shape of X_test =  (400, 20)
Shape of y_test =  (400,)

FEATURE SCALING¶

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train )
Out[15]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [16]:
X_test_scaled = sc.transform(X_test)

USING LINEAR REGRESSION¶

In [17]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
In [18]:
lm.fit(X_train,y_train)
Out[18]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [19]:
lm.score(X_test,y_test)
Out[19]:
0.9154254223097764

USING LOGISTIC REGRESSION¶

In [20]:
from sklearn.linear_model import LogisticRegression
lr= LogisticRegression()
In [21]:
lr.fit(X_train,y_train)
Out[21]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [22]:
lr.score(X_test,y_test)
Out[22]:
0.615

DECISION TREE REGRESSION¶

In [23]:
from sklearn.tree import DecisionTreeRegressor 
dt = DecisionTreeRegressor()
In [24]:
dt.fit(X_train,y_train)
Out[24]:
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
In [25]:
dt.score(X_test,y_test)
Out[25]:
0.82742052519072

RANDOM FOREST REGRESSION¶

In [26]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200)
In [27]:
rf.fit(X_train, y_train)
Out[27]:
RandomForestRegressor(n_estimators=200)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(n_estimators=200)
In [28]:
rf.score(X_test, y_test)
Out[28]:
0.9258929026690985
In [ ]: